{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('ms-en/translated-trainset-parliament.json') as fopen:\n",
    "    data = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "rejected = ['PERTANYAAN-PERTANYAAN JAWAB LISAN', 'PENGGAL KEEMPAT', 'PUSAT JAGAAN BERDAFTAR',\n",
    "           'BILANGAN PUSAT JAGAAN', 'pewan']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50460\n"
     ]
    }
   ],
   "source": [
    "selected, reject = [], []\n",
    "for row in data:\n",
    "    if any([r.lower() in row[0].lower() for r in rejected]):\n",
    "        reject.append(row)\n",
    "        continue\n",
    "    s = row[0]\n",
    "    if (sum(c.isdigit() for c in s) / len(s)) > 0.15:\n",
    "        reject.append(row)\n",
    "        continue\n",
    "    if sum(c.isalpha() for c in s) == 0:\n",
    "        reject.append(row)\n",
    "        continue\n",
    "    selected.append(row)\n",
    "    \n",
    "print(len(selected))\n",
    "x_parliament, y_parliament = list(zip(*selected))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "with open('ms-en/ubuntu-ms-en.json') as fopen:\n",
    "    data = json.load(fopen)\n",
    "    \n",
    "X, Y = list(zip(*data))\n",
    "X = list(X)\n",
    "Y = list(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('ms-en/qed-ms-en.json') as fopen:\n",
    "    data = json.load(fopen)\n",
    "    \n",
    "x, y = list(zip(*data))\n",
    "X.extend(x)\n",
    "Y.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('ms-en/tanzil-ms-en.json') as fopen:\n",
    "    data = json.load(fopen)\n",
    "    \n",
    "x, y = list(zip(*data))\n",
    "X.extend(x)\n",
    "Y.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ms-en/translated-3200000.json',\n",
       " 'ms-en/translated-700000.json',\n",
       " 'ms-en/translated-2100000.json',\n",
       " 'ms-en/translated-3300000.json',\n",
       " 'ms-en/translated-2300000.json',\n",
       " 'ms-en/translated-2000000.json',\n",
       " 'ms-en/translated-600000.json',\n",
       " 'ms-en/translated-900000.json',\n",
       " 'ms-en/translated-1000000.json',\n",
       " 'ms-en/translated-1100000.json',\n",
       " 'ms-en/translated-1900000.json',\n",
       " 'ms-en/translated-500000.json',\n",
       " 'ms-en/translated-1400000.json',\n",
       " 'ms-en/translated-1500000.json',\n",
       " 'ms-en/translated-2600000.json',\n",
       " 'ms-en/translated-200000.json',\n",
       " 'ms-en/translated-2900000.json',\n",
       " 'ms-en/translated-3400000.json',\n",
       " 'ms-en/translated-3500000.json',\n",
       " 'ms-en/translated-2800000.json',\n",
       " 'ms-en/translated-300000.json',\n",
       " 'ms-en/translated-2500000.json',\n",
       " 'ms-en/translated-3100000.json',\n",
       " 'ms-en/translated-1300000.json',\n",
       " 'ms-en/translated-2400000.json',\n",
       " 'ms-en/translated-100000.json',\n",
       " 'ms-en/translated-1600000.json',\n",
       " 'ms-en/translated-2700000.json',\n",
       " 'ms-en/translated-0.json',\n",
       " 'ms-en/translated-800000.json',\n",
       " 'ms-en/translated-1800000.json',\n",
       " 'ms-en/translated-2200000.json',\n",
       " 'ms-en/translated-1200000.json',\n",
       " 'ms-en/translated-1700000.json',\n",
       " 'ms-en/translated-3000000.json',\n",
       " 'ms-en/translated-400000.json']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "translated = glob('ms-en/translated*0.json')\n",
    "translated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "for file in translated:\n",
    "    with open(file) as fopen:\n",
    "        data = json.load(fopen)\n",
    "\n",
    "    x, y = list(zip(*data))\n",
    "    X.extend(x)\n",
    "    Y.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3808696, 3808696)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(X), len(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3808696/3808696 [01:08<00:00, 55384.54it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "from unidecode import unidecode\n",
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    string = unidecode(string).replace('\\n', ' ').replace('\\t', ' ')\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    return string\n",
    "\n",
    "filtered_X, filtered_Y = [], []\n",
    "\n",
    "for i in tqdm(range(len(X))):\n",
    "    X[i] = cleaning(X[i])\n",
    "    Y[i] = cleaning(Y[i])\n",
    "    if len(X[i]) and len(Y[i]):\n",
    "        filtered_X.append(X[i])\n",
    "        filtered_Y.append(Y[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ms-en/part1.txt', 'ms-en/part3.txt', 'ms-en/part2.txt', 'ms-en/part4.txt']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parts = glob('ms-en/part*.txt')\n",
    "parts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('Sertakan Gmail, Google Docs, Google+, YouTube dan Picasa',\n",
       " 'Includes Gmail, Google Docs, Google+, YouTube and Picasa')"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X[0], Y[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3990, 3990)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ls, rs = [], []\n",
    "\n",
    "for part in parts:\n",
    "    with open(part) as fopen:\n",
    "        data = fopen.read().split('\\n')\n",
    "\n",
    "    for row in data:\n",
    "        if not len(row.strip()):\n",
    "            continue\n",
    "        splitted = row.split('<>')\n",
    "        if len(splitted) != 2:\n",
    "            continue\n",
    "        l, r = splitted\n",
    "        l = cleaning(l)\n",
    "        r = cleaning(r)\n",
    "        if len(l) and len(r):\n",
    "            ls.append(l)\n",
    "            rs.append(r)\n",
    "    \n",
    "len(ls), len(rs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "X.extend(ls)\n",
    "Y.extend(rs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3812686, 3812686)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(X), len(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('ms-en.json', 'w') as fopen:\n",
    "    json.dump({'left': X, 'right': Y}, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('english-ms-en.txt', 'w') as fopen:\n",
    "    fopen.write('\\n'.join(Y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('malay-ms-en.txt', 'w') as fopen:\n",
    "    fopen.write('\\n'.join(X))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
